library(readr)
library(mosaic)
library(car)
library(tidyverse)
library(plotly)
library(reshape2)
library(pander)

Model Creation

Data Manipulation

# Data Manipulation/Wrangling

houses <- read.csv("train.csv", header=TRUE)

houses$totalsqft <- houses$X1stFlrSF + houses$X2ndFlrSF  + houses$TotalBsmtSF + houses$GarageArea



houses <- houses %>% 
  mutate(hoodwealth = case_when(
         Neighborhood == "BrDale" | Neighborhood == "IDDTRR" | Neighborhood == "OldTown" | Neighborhood == "SWISU" | Neighborhood == "IDOTRR" ~ "Lower Class",
         Neighborhood == "Veenker" | Neighborhood == "Somerst" | Neighborhood == "Gilbert" | Neighborhood == "Crawfor" | Neighborhood == "Timber" | Neighborhood == "CollgCr" | Neighborhood == "ClearCr" | Neighborhood == "NoRidge" | Neighborhood == "NridgHt"  | Neighborhood == "StoneBr" ~ "Upper Class",
          TRUE ~ "Middle Class"))

houses <- houses %>% 
  filter(totalsqft < 8000)

houses$hoodwealth <- factor(houses$hoodwealth)

houses <- houses %>% 
  mutate(qualGroup = case_when(
    OverallQual == "1" | OverallQual == "2" | OverallQual == "3" | OverallQual == "4" | OverallQual == "5" ~ "poor",
    OverallQual == "7" | OverallQual == "8" | OverallQual == "9" | OverallQual == "10"  | OverallQual == "6" ~ "good"))

houses$qualGroup <- factor(houses$qualGroup)
#houses <- houses %>% 
# select(c(totalsqft, SalePrice, Neighborhood, YearBuilt, hoodwealth, qualGroup, OverallQual))

plot(SalePrice ~ totalsqft, data = houses)

#[1] "GarageType"   "GarageYrBlt"  "GarageFinish" "GarageCars"   "GarageArea"   "GarageQual"   "GarageCond"  

This section was created and edited throughout the process of creating the regression model. The first variable that was created was the total square foot column. This was made by adding the square footage of the first floor, second floor, basement, and garage together. Given that the first three rules of real estate are location, the neighborhood was taken into consideration. 3 groups were created, based on the residuals of each of the included neighborhood compared to a regression model only including the total square footage as the explanatory variable. This is shown below:

Initial Models and Plots

mylm <- lm(SalePrice ~ totalsqft, data = houses)

plot(mylm$residuals ~ as.factor(Neighborhood), data = houses)
abline(h=0)

Any neighborhoods that were shown to have mainly positive residuals were placed in a group called upper class, those with mostly negative residuals were classified as lower class, and the others were labeled middle class. This new factor column with 3 levels was then added to the regression model, creating a 3-lines quadratic model, given that a quadratic term was also introduced to see if it provided a better fit.

house.lm1 <- lm(SalePrice ~ totalsqft + I(totalsqft^2) + hoodwealth, data = houses)
summary(house.lm1)
## 
## Call:
## lm(formula = SalePrice ~ totalsqft + I(totalsqft^2) + hoodwealth, 
##     data = houses)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -172247  -16765    1056   16944  201835 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             7.632e+04  7.598e+03  10.045  < 2e-16 ***
## totalsqft              -1.376e+01  4.576e+00  -3.007  0.00268 ** 
## I(totalsqft^2)          1.166e-02  6.586e-04  17.698  < 2e-16 ***
## hoodwealthMiddle Class  1.626e+04  2.782e+03   5.846 6.22e-09 ***
## hoodwealthUpper Class   5.499e+04  3.030e+03  18.147  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 33750 on 1453 degrees of freedom
## Multiple R-squared:  0.8203, Adjusted R-squared:  0.8198 
## F-statistic:  1658 on 4 and 1453 DF,  p-value: < 2.2e-16
#house.lm1 <- lm(SalePrice ~ totalsqft + I(totalsqft^2) + LotArea, data = houses)
#summary(house.lm1)

Try a 3-Dimensional Model

A few different approaches were used, including adding an interaction, an interaction with the quadratic term, and adding a second continuous variable, but it was decided to just keep the interactions and quadratics at this point.

## 
## Call:
## lm(formula = SalePrice ~ totalsqft + hoodwealth + totalsqft:hoodwealth, 
##     data = houses)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -171593  -17090    1277   16353  240601 
## 
## Coefficients:
##                                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        7687.017   8493.083   0.905    0.366    
## totalsqft                            45.497      3.219  14.133  < 2e-16 ***
## hoodwealthMiddle Class            14729.164   9789.067   1.505    0.133    
## hoodwealthUpper Class            -70819.329  10219.610  -6.930 6.32e-12 ***
## totalsqft:hoodwealthMiddle Class      0.719      3.645   0.197    0.844    
## totalsqft:hoodwealthUpper Class      39.614      3.579  11.068  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 33670 on 1452 degrees of freedom
## Multiple R-squared:  0.8212, Adjusted R-squared:  0.8206 
## F-statistic:  1334 on 5 and 1452 DF,  p-value: < 2.2e-16

At this point, I wanted to make sure I was keeping up with the changes being made and thus created a plot of my current regression model:

house.lm3 <- lm(SalePrice ~ totalsqft + hoodwealth + totalsqft:hoodwealth + I(totalsqft^2) + hoodwealth:I(totalsqft^2), data = houses)
summary(house.lm3)
## 
## Call:
## lm(formula = SalePrice ~ totalsqft + hoodwealth + totalsqft:hoodwealth + 
##     I(totalsqft^2) + hoodwealth:I(totalsqft^2), data = houses)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -193619  -15844    1132   15365  208365 
## 
## Coefficients:
##                                         Estimate Std. Error t value Pr(>|t|)
## (Intercept)                            8.389e+04  2.019e+04   4.155 3.44e-05
## totalsqft                             -1.100e+01  1.404e+01  -0.784  0.43323
## hoodwealthMiddle Class                -5.387e+04  2.330e+04  -2.312  0.02090
## hoodwealthUpper Class                 -4.956e+01  2.518e+04  -0.002  0.99843
## I(totalsqft^2)                         9.568e-03  2.319e-03   4.126 3.90e-05
## totalsqft:hoodwealthMiddle Class       5.162e+01  1.617e+01   3.193  0.00144
## totalsqft:hoodwealthUpper Class        1.490e+01  1.611e+01   0.925  0.35495
## hoodwealthMiddle Class:I(totalsqft^2) -8.610e-03  2.680e-03  -3.213  0.00134
## hoodwealthUpper Class:I(totalsqft^2)   9.708e-04  2.528e-03   0.384  0.70100
##                                          
## (Intercept)                           ***
## totalsqft                                
## hoodwealthMiddle Class                *  
## hoodwealthUpper Class                    
## I(totalsqft^2)                        ***
## totalsqft:hoodwealthMiddle Class      ** 
## totalsqft:hoodwealthUpper Class          
## hoodwealthMiddle Class:I(totalsqft^2) ** 
## hoodwealthUpper Class:I(totalsqft^2)     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 32310 on 1449 degrees of freedom
## Multiple R-squared:  0.8357, Adjusted R-squared:  0.8348 
## F-statistic: 921.1 on 8 and 1449 DF,  p-value: < 2.2e-16
b <- coef(house.lm3)
plot(SalePrice ~ totalsqft, col = as.factor(hoodwealth), data = houses, pch = 19)
curve(b[1] + b[2]*totalsqft + b[5]*I(totalsqft^2), add = TRUE, xname = "totalsqft")
curve(b[1] + b[2]*totalsqft + b[5]*I(totalsqft^2) + 
      b[3] + b[6]*totalsqft + b[8]*I(totalsqft^2), add = TRUE, xname = "totalsqft", col = "red")
curve(b[1] + b[2]*totalsqft + b[5]*I(totalsqft^2) + 
      b[3] + b[6]*totalsqft + b[8]*I(totalsqft^2) + 
      b[4] + b[7]*totalsqft + b[9]*I(totalsqft^2)  , add = TRUE, xname = "totalsqft", col = "green3")

#plot(house.lm3$residuals ~ YearBuilt, data = houses)
#house.lm4 <- lm(SalePrice ~ I(totalsqft^2) + hoodwealth + totalsqft:hoodwealth + hoodwealth:I(totalsqft^2) + houses$YearBuilt, data = houses)
#summary(house.lm4)

#plot(house.lm3$residuals ~ ., data = houses)

Eventually, another new variable was created using the overall quality rating of the house. The original dataset was on a 1-10 scale, but was used to create a binary variable, which defined a rating of 1-5 as poor, and a rating of 6-10 as good. Once this was incorporated into the model, the previously used variable regarding the neighborhood was deemed insignificant due to the summary output of the model, and the fact that the adjusted \(R^2\) value was not significantly negatively affected.

After creating about 8 different models using variables such as the Year Built, garage size, exterior quality and various transformations, a final model was selected. Only a few of the models have been shown here due to the length and breadth, but can be shown if desired.

#One last try
#plot(house.lm8d$residuals ~ ., data = houses)
house.lm9 <- lm(sqrt(sqrt(SalePrice)) ~ totalsqft   + qualGroup + totalsqft:qualGroup + ExterQual + YearBuilt, data = houses)
summary(house.lm9)
## 
## Call:
## lm(formula = sqrt(sqrt(SalePrice)) ~ totalsqft + qualGroup + 
##     totalsqft:qualGroup + ExterQual + YearBuilt, data = houses)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.9594 -0.4157  0.0433  0.4744  3.3919 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             -5.243e+00  1.859e+00  -2.821  0.00486 ** 
## totalsqft                1.564e-03  3.603e-05  43.417  < 2e-16 ***
## qualGrouppoor            5.506e-01  1.803e-01   3.054  0.00230 ** 
## ExterQualFa             -2.655e+00  2.626e-01 -10.113  < 2e-16 ***
## ExterQualGd             -1.004e+00  1.270e-01  -7.905 5.28e-15 ***
## ExterQualTA             -1.395e+00  1.392e-01 -10.018  < 2e-16 ***
## YearBuilt                1.127e-02  9.270e-04  12.153  < 2e-16 ***
## totalsqft:qualGrouppoor -4.091e-04  6.538e-05  -6.258 5.13e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.815 on 1450 degrees of freedom
## Multiple R-squared:  0.8429, Adjusted R-squared:  0.8421 
## F-statistic:  1111 on 7 and 1450 DF,  p-value: < 2.2e-16
house.lm10 <- lm(sqrt(sqrt(SalePrice)) ~ totalsqft + qualGroup + totalsqft:qualGroup + YearBuilt, data = houses)
summary(house.lm10)
## 
## Call:
## lm(formula = sqrt(sqrt(SalePrice)) ~ totalsqft + qualGroup + 
##     totalsqft:qualGroup + YearBuilt, data = houses)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.1611 -0.4462  0.0459  0.4929  3.5418 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             -1.324e+01  1.696e+00  -7.805 1.13e-14 ***
## totalsqft                1.724e-03  3.361e-05  51.302  < 2e-16 ***
## qualGrouppoor            8.172e-01  1.823e-01   4.483 7.92e-06 ***
## YearBuilt                1.446e-02  8.673e-04  16.672  < 2e-16 ***
## totalsqft:qualGrouppoor -5.349e-04  6.536e-05  -8.183 5.97e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8518 on 1453 degrees of freedom
## Multiple R-squared:  0.828,  Adjusted R-squared:  0.8276 
## F-statistic:  1749 on 4 and 1453 DF,  p-value: < 2.2e-16

Final Regression Model

The Final regression model includes a \(1/4\) power transformation on the Y value of Sale Price, predicted on two continuous variables, total square foot and year built, as well as the binary variable of quality and the interaction of that variable with total square foot, with all variables being significant:

final.lm <- lm(sqrt(sqrt(SalePrice)) ~ totalsqft + qualGroup + totalsqft:qualGroup +  YearBuilt , data = houses)
summary(final.lm) %>% pander(caption = "Final Regression Model")
  Estimate Std. Error t value Pr(>|t|)
(Intercept) -13.24 1.696 -7.805 1.132e-14
totalsqft 0.001724 3.361e-05 51.3 0
qualGrouppoor 0.8172 0.1823 4.483 7.925e-06
YearBuilt 0.01446 0.0008673 16.67 3.087e-57
totalsqft:qualGrouppoor -0.0005349 6.536e-05 -8.183 5.971e-16
Final Regression Model
Observations Residual Std. Error \(R^2\) Adjusted \(R^2\)
1458 0.8518 0.828 0.8276
graph_resoy <- 1
graph_resox <- 500

axis_x <- seq(min(houses$totalsqft), max(houses$totalsqft), by = graph_resox)
axis_y <- seq(min(houses$YearBuilt), max(houses$YearBuilt), by = graph_resoy)

house_surface <- expand.grid(totalsqft = axis_x, YearBuilt = axis_y, KEEP.OUT.ATTRS=F)
tmp <- house_surface
tmp$qualGroup <- as.factor("good")
house_surface$Z <- predict.lm(final.lm, newdata = tmp)^4
house_surface <- acast(house_surface, YearBuilt ~ totalsqft, value.var = "Z") #y ~ x

house_surface2 <- expand.grid(totalsqft = axis_x, YearBuilt = axis_y, KEEP.OUT.ATTRS=F)
tmp <- house_surface2
tmp$qualGroup <- as.factor("poor")
house_surface2$Z <- predict.lm(final.lm, newdata = tmp)^4
house_surface2 <- acast(house_surface2, YearBuilt ~ totalsqft, value.var = "Z") #y ~ x

myplot <- plot_ly(houses, 
        x = ~totalsqft, 
        y = ~YearBuilt, 
        z = ~SalePrice,
        text = rownames(houses), 
        type = "scatter3d", 
        mode = "markers",
        color = ~ qualGroup) 
myplot <- myplot %>%
  add_trace(z = house_surface,
            x = axis_x,
            y = axis_y,
            type = "surface") 
myplot <- myplot %>% 
  add_trace(z = house_surface2,
            x = axis_x,
            y = axis_y,
            type = "surface",
            showlegend = FALSE)
myplot

Diagnostic Plots

par(mfrow=c(1,3))
plot(final.lm, which = 1)
qqPlot(final.lm$residuals)
## [1]  632 1323
plot(final.lm$residuals)

par(mfrow=c(1,2))
plot(final.lm, which = c(4,5))

Considering these diagnostic plots, we can see that there may be some other variable that is affecting the Sale Price due to the lack of complete linearity in the model, and the QQ Plot shows that the residuals are not normally distributed, though they show much improvement using the \(1/4\) power transformation over the non-transformed Sale Price.

Point 186 does also appear to possibly be an outlier, but considering its Cook’s Distance and leverage, we do not consider it an outlier.

Validation

set.seed(121)

num_rows <- 1000 #1460 total
keep <- sample(1:nrow(houses), num_rows)

mytrain <- houses[keep, ] #Use this in the lm(..., data=mytrain)

mytest <- houses[-keep, ] #Use this in the predict(..., newdata=mytest)
final.lm <- lm(sqrt(sqrt(SalePrice)) ~ totalsqft + qualGroup + totalsqft:qualGroup + YearBuilt, data = mytrain)
yhattest <- predict(final.lm, newdata = mytest)^4
ybartest <- mean(mytest$SalePrice)

SSTO <- sum( (mytest$SalePrice - ybartest)^2)
SSE <- sum( (mytest$SalePrice - yhattest)^2)
rsq <- 1-SSE/SSTO

n <- length(mytest$SalePrice)
p <- length(coef(final.lm))
adrsq <- 1 - (n-1)/(n-p)*SSE/SSTO
  my_output_table2 <- data.frame(Model = "True Regression Model", `Original R2` = summary(final.lm)$r.squared, `Orig. Adj. R-squared` = summary(final.lm)$adj.r.squared, `Validation R-squared` = rsq, `Validation Adj. R^2` = adrsq)

colnames(my_output_table2) <- c("Model", "Original $R^2$", "Original Adj. $R^2$", "Validation $R^2$", "Validation Adj. $R^2$")

knitr::kable(my_output_table2, escape=TRUE, digits=4)
Model Original \(R^2\) Original Adj. \(R^2\) Validation \(R^2\) Validation Adj. \(R^2\)
True Regression Model 0.8309 0.8302 0.8057 0.804

Interpretation

The final model being a two-plane model makes interpretation a bit tricky, but definitely doable. It is also called a double contour plot, due to the fact that the interaction of the total square footage and the quality group has been allowed. A few interpretations are given below:

An increase of 1 square foot gives an increase of \(0.001724\) square root square root dollars in the sale price, with all other variables being held constant.

An increase of 1 in the year built variable, or a house being 1 year younger, gives an increase in the sale price of \(0.01446\) square root square root dollars, all other variables being held constant.

A change from the quality group of poor to good impacts the effect of total square footage, meaning that if the quality group is poor, a 1 square foot increase in a house produces an increase of only \(0.0011891\) square root square root dollars, as opposed to the increase of \(0.001724\) square root square root dollars when the quality group is good.

An increase of 1 unit for both the total square foot variable and the year built variable gives an increase of \(0.016184\) square root square root dollars, assuming the quality group is good. If it is poor, then the increase of both the same variables by 1 unit increases the sale price of the house by only \(0.0156491\) square root square root dollars.

Predictions and prediction intervals have been included below to demonstrate the changes that are made by one variable at a time:

Total Square Foot

predict(final.lm, data.frame(totalsqft = 1200, qualGroup = "poor", YearBuilt = 2000), interval = "prediction")^4 %>% pander(caption = "Total Square Footage = 1200")
Total Square Footage = 1200
fit lwr upr
101889 68094 146931
predict(final.lm, data.frame(totalsqft = 1300, qualGroup = "poor", YearBuilt = 2000), interval = "prediction")^4 %>% pander(caption = "Total Square Footage = 1300")
Total Square Footage = 1300
fit lwr upr
104725 70215 150621
predict(final.lm, data.frame(totalsqft = 1400, qualGroup = "poor", YearBuilt = 2000), interval = "prediction")^4 %>% pander(caption = "Total Square Footage = 1400")
Total Square Footage = 1400
fit lwr upr
107620 72382 154384
predict(final.lm, data.frame(totalsqft = 1500, qualGroup = "poor", YearBuilt = 2000), interval = "prediction")^4 %>% pander(caption = "Total Square Footage = 1500")
Total Square Footage = 1500
fit lwr upr
110575 74598 158221

Year Built

predict(final.lm, data.frame(totalsqft = 2000, qualGroup = "poor", YearBuilt = 1990), interval = "prediction")^4 %>% pander(caption = "Year Built = 1990")
Year Built = 1990
fit lwr upr
122253 83420 173286
predict(final.lm, data.frame(totalsqft = 2000, qualGroup = "poor", YearBuilt = 1995), interval = "prediction")^4 %>% pander(caption = "Year Built = 1995")
Year Built = 1995
fit lwr upr
124250 84909 175897
predict(final.lm, data.frame(totalsqft = 2000, qualGroup = "poor", YearBuilt = 2000), interval = "prediction")^4 %>% pander(caption = "Year Built = 2000")
Year Built = 2000
fit lwr upr
126271 86417 178540
predict(final.lm, data.frame(totalsqft = 2000, qualGroup = "poor", YearBuilt = 2005), interval = "prediction")^4 %>% pander(caption = "Year Built = 2005")
Year Built = 2005
fit lwr upr
128317 87943 181215

Quality Group “Good”

predict(final.lm, data.frame(totalsqft = 2000, qualGroup = "poor", YearBuilt = 2000), interval = "prediction")^4 %>% pander(caption = "Quality = Poor")
Quality = Poor
fit lwr upr
126271 86417 178540
predict(final.lm, data.frame(totalsqft = 2000, qualGroup = "good", YearBuilt = 2000), interval = "prediction")^4 %>% pander(caption = "Quality = Good")
Quality = Good
fit lwr upr
136881 94426 192260
predict(final.lm, data.frame(totalsqft = 2500, qualGroup = "poor", YearBuilt = 2005), interval = "prediction")^4 %>% pander(caption = "Quality = Poor")
Quality = Poor
fit lwr upr
145834 101226 203777
predict(final.lm, data.frame(totalsqft = 2500, qualGroup = "good", YearBuilt = 2005), interval = "prediction")^4 %>% pander(caption = "Quality = Good")
Quality = Good
fit lwr upr
164387 115440 227461

Due to these interpretations and predictions, we can conclude that the Year Built variable has the greatest effect on the Sale price of a home. This also makes sense, because a one unit change in year, meaning a home is an entire year younger, is a lot more logically significant than a single unit increase in square footage when considering the selling price of a house.